###################################
# COMPILES THE DATASET
###################################
# partially based on code written by Christoph Nolte
# questions to bowydenbraber@gmail.com



library(foreign)
library(DescTools)

# Load census units. Keep area information
cu <- read.dbf(ff("CU00M.dbf"))[c(1,4)]
cu.id <- "CU_ID_50M" # 50 km2
colnames(cu) <- c(cu.id, "area")


##############
# CENSUS DATA
##############

# FIRST STEP: Map data saved at the year-2000 census tract level to census units

# Build converter
cc <- read.dbf(ff("Census_spatial/CT_CU00_M50.dbf"))
cc$CODIGO <- as.character(cc$CODIGO)
codigos <- NULL
cu.ids <- NULL

# Split up rows/shapes that cover multiple census tracts
# Explanation: Some year-2000 census tract shapes cover several census tracts
# If so, their codes have 20 characters, and the last indicate a range (e.g. "1234-1236")
# Here we generate a separate row for each code contained in that range
for(i in 1:nrow(cc)) {
  ccr <- cc[i,]
  if(nchar(ccr$CODIGO)==20) {
    root <- substr(ccr$CODIGO,1,11)
    c.start <- substr(ccr$CODIGO,12,15)
    c.end <- substr(ccr$CODIGO,17,20)
    full <- paste(root,as.character(formatC(as.numeric(c.start):as.numeric(c.end), width=4, flag=0)),sep="")
  } else full <- ccr$CODIGO
  codigos <- c(codigos,full)
  cu.ids <- c(cu.ids,rep(ccr$CU_ID,length(full)))
}
conv <- data.frame(ct00id=codigos,cu00id=cu.ids)
conv$ct00id <- as.numeric(levels(conv$ct00id)[conv$ct00id])


# Map data from year-2000 census tracts to census units

# Load data from the year-2000 census
d <- read.csv("BraCens2000 - Key Data.csv")
d.conv <- merge(conv,d,by.x="ct00id",by.y="code")
colnames(d.conv) <- c("ct00id","cu00id",paste(colnames(d.conv)[3:ncol(d.conv)],".00",sep=""))

# Add data from the year-2010 census that has already been converted (by area)

d <- read.csv("CU00_Bracens2010.ineq_rend.csv",sep=",")
cu <- merge(cu,d[,-which(colnames(d)=="area.10")],by.x=cu.id,by.y="cu00id", all=TRUE)





# THIRD STEP: Compute key indicators of interest for year-2000 and year-2010 census

# a) Census-2000 data

# % literate households heads 2000
cu$p.hhh.lit.00 <- cu$hhh.lit.00/cu$hhh.REF.00
# % households with poor sanitation 2000
cu$p.hh.bs.poor.00 <- cu$hh.bs.poor.00/cu$hh.REF.00
# average household income 2000
cu$hhh.inc.avg.00 <- cu$hhh.inc.tot.00/cu$hhh.REF.00
cu$hhh.inc.avg.00_infl<-cu$hhh.inc.avg.00*1.871102 # based on http://www.ipeadata.gov.br

# b) Census-2010 data

# average household income 2010
cu$hhh.inc.avg.10 <- cu$hhh.inc.tot.10/cu$hhh.REF.10
# % literate households heads 2010
cu$p.hhh.lit.10 <- cu$hhh.lit.10/cu$hhh.REF.10
# % households with poor sanitation 2010
cu$p.hh.bs.poor.10 <- cu$hh.bs.poor.10/cu$hh.REF.10

# Calculate gini
midpoints <- c(0,0.25,0.75,1.5,2.5,4,7.5,12.5,17.5,20)
incomeclasses00<-c("hhh.inc.no.00","hhh.inc.half.00","hhh.inc.full.00","hhh.inc.full_2.00","hhh.inc.2_3.00","hhh.inc.3_5.00","hhh.inc.5_10.00","hhh.inc.10_15.00","hhh.inc.15_20.00","hhh.inc.20.00")
incomeclasses10<-c("hhh.inc.no.10","hhh.inc.half.10","hhh.inc.full.10","hhh.inc.full_2.10","hhh.inc.2_3.10","hhh.inc.3_5.10","hhh.inc.5_10.10","hhh.inc.10_15.10","hhh.inc.15_20.10","hhh.inc.20.10")

ginis00<-data.frame(matrix(ncol=3,nrow=0))
colnames(ginis00)<-c("gini","lwr.ci","upr.ci")
for(i in 1:length(cu)){
  w<-as.numeric(cu[i,incomeclasses00])
  ginivalues00<-DescTools::Gini(x=midpoints, weights=w, conf.level=0.95, unbiased=FALSE)
  ginis00<-rbind(ginis00,ginivalues00)
}
colnames(ginis00)<-c("gini.00","lwr.ci.00","upr.ci.00")

ginis10<-data.frame(matrix(ncol=3,nrow=0))
colnames(ginis10)<-c("gini","lwr.ci","upr.ci")
for(i in 1:length(cu)){
  w<-as.numeric(cu[i,incomeclasses10])
  ginivalues10<-DescTools::Gini(x=midpoints, weights=w, conf.level=0.95, unbiased=FALSE)
  ginis10<-rbind(ginis10,ginivalues10)
}
colnames(ginis10)<-c("gini.10","lwr.ci.10","upr.ci.10")


cbind(cu,ginis00)
cbind(cu,ginis10)


######################################################
# Add municipality-level indicators
######################################################

conv.mun <- conv
conv.mun$mun <- substr(conv.mun$ct00id,1,7)
conv.mun <- conv.mun[,2:3]
conv.mun <- unique(conv.mun)

mun <- read.csv(ff("Census_data/MunicipalitiesFull.csv"))[,c("CODE", "area", "pop00", "pop", "pop09")]
mun$state <- floor(mun$CODE/100000)

colnames(mun) <- paste("mun.",colnames(mun),sep="")
conv.mun <- merge(conv.mun,mun,by.x="mun",by.y="mun.CODE",all.x=TRUE)
cu <- merge(cu,conv.mun,by.x=cu.id,by.y="cu00id")



######################################################
# Add indicators compiled from spatial datasets
######################################################

#add prodes at baseline
prodes<-read.csv("prodes_forestcover2000.csv")
prodes_r <- prodes %>% dplyr::select(CU_ID_50M,forest_mean)
cu <- merge(cu,prodes_r,by.x=cu.id,by.y=CU_ID_50M, all=TRUE)

#add travel time
d <- read.dbf(paste(fld,"CU00_traveltime_standard.dbf",sep=""))
d <- d[,c(cu.id, "MEAN")]
colnames(d) <- c(cu.id,"traveltime")
cu <- merge(cu,d,by.x=cu.id,by.y=cu.id, all=TRUE)


# Legal Amazon
d <- read.dbf(paste(fld,"CU00_AL.dbf",sep=""))
d <- d[,c(cu.id, "MEAN")]
colnames(d) <- c(cu.id,"al")
cu <- merge(cu,d,by.x=cu.id,by.y=cu.id, all=TRUE)
#reduce to Legal Amazon (90% threshold)
cu <- cu[cu$al>0.9,]

# Floodability
d <- read.dbf(paste(fld,"CU00_floodability.dbf",sep=""))
d <- d[,c(cu.id, "MEAN")]
colnames(d) <- c(cu.id,"floodable")
cu <- merge(cu,d,by.x=cu.id,by.y=cu.id, all=TRUE)

# Slope
d <- read.dbf(paste(fld,"CU00_slope.dbf",sep=""))
d <- d[,c(cu.id, "MEAN")]
colnames(d) <- c(cu.id,"slope")
cu <- merge(cu,d,by.x=cu.id,by.y=cu.id, all=TRUE)

#Hansen:
treecover_2010<-read.csv("Hansen_2010_treecover.csv")
cu <- merge(cu,treecover_2010,by.x=cu.id,by.y=CU_ID_50M, all=TRUE)

#agricultural suitability
agrsuit<-read.csv("agrsuit_tab.csv")
agrsuit$agrsuit<-agrsuit$average-1
cu <- merge(cu,agrsuit[c(1,7)],by.x=cu.id,by.y=CU_ID_50M, all=TRUE)

#mining
minesincu<-read.csv("minesincu00_2607_tab.csv")
#remove agua mineral
mines_withoutwater<-filter(minesincu,SUBS!="?GUA MINERAL")
mines_withoutwater$mines_ano<-ifelse(mines_withoutwater$ANO<2000,1,0)
mines_withoutwater$mineperc_before_2000<-ifelse(mines_withoutwater$mines_ano>0,mines_withoutwater$PERCENTAGE,0)
mines_withoutwater$mineperc_after_2000<-ifelse(mines_withoutwater$mines_ano<1,mines_withoutwater$PERCENTAGE,0)

#mines per CT
mines_all<-summarise(group_by(mines_withoutwater,CU_ID_50M),mineperc_before2000=sum(mineperc_before_2000),mineperc_after2000=sum(mineperc_after_2000))

#presence/absence of mines:
mines_all$mine_presabs_before2000<-ifelse(mines_all$mineperc_before2000>0,1,0)
mines_all$mine_presabs_after2000<-ifelse(mines_all$mineperc_after2000>0,1,0)

#mining buffers
mine_5k_after<-read.csv("cumines_5k_normal.csv")
mine_5k_before<-read.csv("cumines_5k_normal_before2000.csv")
mine_25k_after<-read.csv("cumines_25kafter2000.csv")
mine_25k_before<-read.csv("cumines_25kbefore2000.csv")
mine_50k_after<-read.csv("cumines_50kafter2000.csv")
mine_50k_before<-read.csv("cumines_50kbefore2000.csv")
mine_75k_after<-read.csv("cumines_75kafter2000.csv")
mine_75k_before<-read.csv("cumines_75kbefore2000.csv")
mine_10k_after<-read.csv("cumines_10kafter2000.csv")
mine_10k_before<-read.csv("cumines_10kbefore2000.csv")
mine_100k_exp<-read.csv("mines_100k_exp.csv")
mine_100k_pres<-read.csv("mines_100k_pres.csv")

#merge mining to the main file
cu<-merge(cu,mines_all,by.x=cu.id,by.y="CU_ID_50M",all.x=TRUE)

cu["mine_presabs_before2000"][is.na(cu["mine_presabs_before2000"])]<-0
cu["mine_presabs_after2000"][is.na(cu["mine_presabs_after2000"])]<-0
cu["mineperc_before2000"][is.na(cu["mineperc_before2000"])]<-0
cu["mineperc_after2000"][is.na(cu["mineperc_after2000"])]<-0

#flooding from global land cover
glob1<-read.csv("glob_flooded_mask_data.csv")
glob1$flooded<-glob1$landcover
cu <- merge(cu,glob1,by.x=cu.id,by.y=CU_ID_50M, all=TRUE)

#elevation
elevation<-read.csv("srtm_data_19dec22.csv")
elevation$elevation<-elevation$mean
cu <- merge(cu,elevation,by.x=cu.id,by.y=CU_ID_50M, all=TRUE)



# All UC coverages
d <- read.dbf(paste(fld,"CU00_UC_coverage.dbf",sep=""))
d <- d[,c(1,4:length(colnames(d)))]
colnames(d)[1] <- c(cu.id)
cu <- merge(cu,d,by.x=cu.id,by.y=cu.id, all=TRUE)

write.csv(cu, "CU_BASEDATA.csv")

